This project aims to explore data on housing prices in an effort to design a model for estimating pricing and other behaviors of the housing market, specifically within King County.

Load the required packages.

#load dataset from project folder
suppressWarnings({housing_data <- as.tbl(read.csv("kc_house_data.csv", stringsAsFactors = F))})
#remove unused columns and normalize the data
housing_data_rev <- housing_data[, 3:21]

Load the dataset and remove unused features.

This dataset is very large, so to get a clearer visual representation of how the data is distributed, a random sample of 10,000 rows is used for training (plus a further 2,000 rows for testing) instead of the whole dataset; each sample is ordered by price, as this is the dependent variable. The sample indexes are selected at runtime, so the data and analysis differ each time the script is run. Because the sampling is random and the dataset is homogeneous and static, the sampled rows should always be representative of the whole dataset.

#Look at the distributions:
p1 <- plot_ly() %>% add_markers(name = "Pricing",
                                x = 1:length(housing_train_data$price), 
                                y = housing_train_data$price) %>% 
                    layout(title = "King County Housing Feature Distributions")
p2 <- plot_ly() %>% add_markers(name = "Rating v Pricing", 
                                x = housing_train_data$grade, 
                                y = housing_train_data$price)
p3 <- plot_ly() %>% add_markers(name = "Condition v Pricing", 
                                x = housing_train_data$condition, 
                                y = housing_train_data$price)
p4 <- plot_ly() %>% add_markers(name = "Sq. Footage v Pricing", 
                                x = housing_train_data$sqft_lot, 
                                y = housing_train_data$price)
subplot(p1, p2, p3, p4, nrows = 2)

Looking at the distributions of prices over their ordering by magnitude, it appears that the prices of houses that are below $1-2 million follow a linear decline, with all houses above $1-2 million increasing sharply in their prices and bucking the trend. Assuming that the features in this dataset capture most of the factors that influence the pricing of a home, some linear combination of the features could give a reasonable estimate for the pricing of an individual house below $1-2 million. Since the distribution is closer to a power law, two separate linear functions should give a pretty good estimate by accounting for the ‘knee’ in the pricing distribution. First though, we’ll try fitting a single line to the data and see what kind of error comes out.

#Scale datasets 
housing_train_data_scaled <- cbind(housing_train_data[,1], scale(housing_train_data[,2:7]))
housing_test_data_scaled <- cbind(housing_test_data[,1], scale(housing_test_data[,2:7]))
summary(housing_train_data_scaled)
     price          sqft_living         sqft_lot         condition           grade            yr_built        yr_renovated    
 Min.   :  78000   Min.   :-1.9425   Min.   :-0.3584   Min.   :-3.6637   Min.   :-5.6012   Min.   :-2.4072   Min.   :-0.2145  
 1st Qu.: 324950   1st Qu.:-0.7100   1st Qu.:-0.2460   1st Qu.:-0.6249   1st Qu.:-0.5509   1st Qu.:-0.6765   1st Qu.:-0.2145  
 Median : 452000   Median :-0.1803   Median :-0.1809   Median :-0.6249   Median :-0.5509   Median : 0.1379   Median :-0.2145  
 Mean   : 545343   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
 3rd Qu.: 649612   3rd Qu.: 0.5116   3rd Qu.:-0.1033   3rd Qu.: 0.8945   3rd Qu.: 0.2908   3rd Qu.: 0.8844   3rd Qu.:-0.2145  
 Max.   :7700000   Max.   :12.3818   Max.   :41.0458   Max.   : 2.4138   Max.   : 4.4994   Max.   : 1.4953   Max.   : 4.7073  
summary(housing_test_data_scaled)
     price          sqft_living         sqft_lot         condition           grade            yr_built        yr_renovated    
 Min.   :  83000   Min.   :-1.8818   Min.   :-0.4441   Min.   :-3.6826   Min.   :-3.9686   Min.   :-2.5008   Min.   :-0.1987  
 1st Qu.: 320000   1st Qu.:-0.6862   1st Qu.:-0.3053   1st Qu.:-0.6176   1st Qu.:-0.5599   1st Qu.:-0.6673   1st Qu.:-0.1987  
 Median : 445000   Median :-0.1674   Median :-0.2243   Median :-0.6176   Median :-0.5599   Median : 0.1284   Median :-0.1987  
 Mean   : 533756   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
 3rd Qu.: 631719   3rd Qu.: 0.5319   3rd Qu.:-0.1155   3rd Qu.: 0.9149   3rd Qu.: 0.2923   3rd Qu.: 0.8895   3rd Qu.:-0.1987  
 Max.   :3278000   Max.   : 5.9343   Max.   :13.6084   Max.   : 2.4474   Max.   : 3.7010   Max.   : 1.4776   Max.   : 5.0703  
require(tensorflow)

#split data for training
#test general linear model on relevant quantitative data

#Test on a nueral network of 10 nodes


X <- tf$placeholder(tf$float32, shape(NULL, 7L))
Y <- tf$placeholder(tf$float32, shape(NULL, 1L))



W1 <- tf$Variable(tf$zeros(shape(10, 7)))
b1 <- tf$Variable(tf$zeros(shape(10, 1)))
W2 <- tf$Variable(tf$zeros(shape(1, 10)))
b2 <- tf$Variable(tf$zeros(shape(1, 1)))

A1 <- tf$nn$relu(tf$add(tf$matmul(X,W1), b1))
A2 <- tf$nn$relu(tf$add(tf$matmul(A1,W2), b2))

cost <- tf
tf$placeholder(tf$float32, shape(7L, NULL))

housing_train_price <-  housing_train_data[, "price"]
housing_test_price <- housing_test_data[, "price"]


glm_mod <- rxFastLinear(price ~ sqft_living + grade + condition, data = housing_train_data_scaled, type = "regression", verbose = 0)


price_mod <- rxPredict(glm_mod, data = housing_test_data_scaled, verbose = 0, extraVarsToWrite = c("sqft_living", "grade","condition"))

 xvals <- 1:length(price_mod$Score)

price_est_plotly <- plot_ly() %>% 
  add_markers(name = "Test Set Price", x = xvals, y = housing_test_price$price) %>%
  add_markers(name = "Linear Model est", x = xvals, y = price_mod$Score) %>%
  add_lines(name = "Magnitude Difference", x = xvals, y = price_mod$Score - housing_test_price$price) %>%
  layout(title = "Attempting Linear Regression Model
                  </br> for Pricing Homes in King County")

#rxLinePlot(Score ~ sqft_living + grade + condition, type = c("p", "r"), data = price_mod)
price_est_plotly
#summary(glm_mod)

Given that the distribution of prices was already known to follow a non-linear distribution, the result of this linear estimate isn’t very surprising. The linear fit may have been within a few hundred thousand dollars for the long tail, but the whole seems to be corrupted in trying to fit the collection of values above the elbow. Below around $1 million, the housing prices are pretty evenly distributed, but as the prices increase they start jumping up by orders of magnitude. Let’s try the same approach for all values of price < $800K.

housing_train_data_rev1 <- filter(housing_train_data_scaled, price < 8e5)
housing_train_price_rev1 <- filter(housing_train_price, price < 8e5)
housing_test_data_rev1 <- filter(housing_test_data_scaled, price < 8e5)
housing_test_price_rev1 <- filter(housing_test_price, price < 8e5)
max_test_rev1 <- apply(housing_test_data_rev1, FUN = max, MARGIN = 2)
min_test_rev1 <- apply(housing_test_data_rev1, FUN = min, MARGIN = 2)
mean_test_rev1 <- apply(housing_test_data_rev1, FUN = mean, MARGIN = 2)


glm_mod_rev1 <- rxFastLinear(price ~ sqft_living + grade + condition, data = housing_train_data_rev1, type = "regression", verbose = 0)


price_mod_rev1 <- rxPredict(glm_mod_rev1, data = housing_test_data_rev1, verbose = 0, extraVarsToWrite = c("sqft_living", "grade","condition"))
xvals_rev1 <- 1:length(price_mod_rev1$Score)

price_est_plotly_rev1 <- plot_ly() %>% 
  add_markers(name = "Test Set Price", x = xvals_rev1, y = housing_test_price_rev1$price) %>%
  add_markers(name = "Linear Model est", x = xvals_rev1, y = price_mod_rev1$Score) %>%
  add_lines(name = "Magnitude Difference", x = xvals_rev1, y = price_mod_rev1$Score - housing_test_price_rev1$price) %>%
  layout(title = "Attempting Linear Regression Model
                  </br>for Pricing Homes in King County
                  </br>Where Price of Home < $800k")

price_est_plotly_rev1
#summary(glm_mod_rev1)

By cutting off the less linear part of the dataset, the maximum error between the estimate and the actual price was decreased.

---
title: "King County Housing Prices"
author: "Duncan McKinnon"
output: html_notebook
---

This project aims to explore data on housing prices in an effort to design a model for estimating pricing and other behaviors of the housing market, specifically within King County. 


```{r message = F, warning = F, echo = F, strip.white = F, tidy = T}

# Install any packages that are missing, then attach everything the notebook
# uses. library() is used instead of require() so that a package that cannot
# be loaded stops the notebook here rather than failing later with a
# confusing "could not find function" error.
suppressPackageStartupMessages({
  package_list <- c(
    "tidyverse", "knitr", "plotly", "RColorBrewer",
    "sqldf", "stringr", "tensorflow"
  )
  non_installed <- package_list[!(package_list %in% installed.packages()[, "Package"])]
  if (length(non_installed) > 0) {
    install.packages(non_installed)
  }

  # dplyr is installed as part of the tidyverse; only dplyr is attached.
  library(dplyr)
  library(plotly)
  library(RColorBrewer)
  library(sqldf)
  library(knitr)
  library(stringr)
  library(tensorflow)
})

# Default chunk options for every chunk that follows.
knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE,
  strip.white = FALSE,
  tidy = TRUE
)
```
Load the required packages.


```{r}
# Load the dataset from the project folder.
# as_tibble() replaces as.tbl(), which dplyr has deprecated.
housing_data <- as_tibble(read.csv("kc_house_data.csv", stringsAsFactors = FALSE))

# Drop the first two columns, which are not used in the analysis.
housing_data_rev <- housing_data[, 3:21]

# Keep the response (price) first, followed by the six predictors. Columns are
# selected by name so the selection does not depend on column positions; later
# chunks rely on this order (price in column 1, predictors in 2:7).
housing_features <- housing_data_rev %>%
  select(
    price, sqft_living, sqft_lot, condition,
    grade, yr_built, yr_renovated
  )

# Draw ONE random sample without replacement and split it, so that no row can
# end up in both the training and the test set. Sampling the two sets
# independently would let test rows leak into training.
n_train <- 10000
n_test <- 2000
housing_sample <- housing_features %>% sample_n(n_train + n_test)

housing_train_data <- housing_sample %>%
  slice(seq_len(n_train)) %>%
  arrange(desc(price))

housing_test_data <- housing_sample %>%
  slice(n_train + seq_len(n_test)) %>%
  arrange(desc(price))

# Summary statistics of the response in each split.
price_stats <- list(
  "train_mean" = mean(housing_train_data$price),
  "train_SD" = sd(housing_train_data$price),
  "test_mean" = mean(housing_test_data$price),
  "test_SD" = sd(housing_test_data$price)
)

price_stats
```
Load the dataset and remove unused features.


This dataset is very large, so to get a clearer visual representation of how the data is distributed, a random sample of 10,000 rows is used for training (plus a further 2,000 rows for testing) instead of the whole dataset; each sample is ordered by price, as this is the dependent variable.  The sample indexes are selected at runtime, so the data and analysis differ each time the script is run. Because the sampling is random and the dataset is homogeneous and static, the sampled rows should always be representative of the whole dataset.


```{r}
# Look at the distributions of price against rank and against three features.
# The training data is sorted by descending price, so the rank plot (p1) shows
# the price curve from the most to the least expensive home.
price_rank <- seq_along(housing_train_data$price)

p1 <- plot_ly() %>%
  add_markers(
    name = "Pricing",
    x = price_rank,
    y = housing_train_data$price
  ) %>%
  layout(title = "King County Housing Feature Distributions")

p2 <- plot_ly() %>%
  add_markers(
    name = "Rating v Pricing",
    x = housing_train_data$grade,
    y = housing_train_data$price
  )

p3 <- plot_ly() %>%
  add_markers(
    name = "Condition v Pricing",
    x = housing_train_data$condition,
    y = housing_train_data$price
  )

p4 <- plot_ly() %>%
  add_markers(
    name = "Sq. Footage v Pricing",
    x = housing_train_data$sqft_lot,
    y = housing_train_data$price
  )

# Arrange the four panels in a 2 x 2 grid.
subplot(p1, p2, p3, p4, nrows = 2)
```
Looking at the distributions of prices over their ordering by magnitude, it appears that the prices of houses that are below \$1-2 million follow a linear decline, with all houses above \$1-2 million increasing sharply in their prices and bucking the trend.  Assuming that the features in this dataset capture most of the factors that influence the pricing of a home, some linear combination of the features could give a reasonable estimate for the pricing of an individual house below \$1-2 million.  Since the distribution is closer to a power law, two separate linear functions should give a pretty good estimate by accounting for the 'knee' in the pricing distribution.  First though, we'll try fitting a single line to the data and see what kind of error comes out.

```{r message = F, warning = F}
# Standardise the six predictor columns (columns 2:7); price (column 1) is
# left in its original units.
train_features_scaled <- scale(housing_train_data[, 2:7])

# The test set is standardised with the TRAINING means and standard deviations
# so both sets live on the same scale. Scaling the test set with its own
# statistics would make a given scaled value mean something different in each
# set and would bias predictions from a model fitted on the training data.
test_features_scaled <- scale(
  housing_test_data[, 2:7],
  center = attr(train_features_scaled, "scaled:center"),
  scale = attr(train_features_scaled, "scaled:scale")
)

housing_train_data_scaled <- cbind(housing_train_data[, 1], train_features_scaled)
housing_test_data_scaled <- cbind(housing_test_data[, 1], test_features_scaled)

summary(housing_train_data_scaled)
summary(housing_test_data_scaled)
```

```{r message = F, warning = F}
library(tensorflow)

# Graph for a neural network with one hidden layer of 10 ReLU nodes that
# regresses price on the input features. Only the graph is defined here; no
# session is run and no training step is created in this chunk.

# NOTE(review): the scaled data frames hold price plus 6 predictors; confirm
# whether 7 input columns is intended or whether this should be 6.
n_features <- 7L
n_hidden <- 10L

# Inputs: one row per house.
X <- tf$placeholder(tf$float32, shape(NULL, n_features))
Y <- tf$placeholder(tf$float32, shape(NULL, 1L))

# Weights are laid out as (inputs, outputs) so that X %*% W is well defined:
# (N, 7) x (7, 10) -> (N, 10), then (N, 10) x (10, 1) -> (N, 1).
# They start from small random values: with all-zero weights every hidden unit
# outputs 0 and receives a zero gradient, so the network could never learn.
W1 <- tf$Variable(tf$random_normal(shape(n_features, n_hidden), stddev = 0.1))
b1 <- tf$Variable(tf$zeros(shape(1L, n_hidden)))
W2 <- tf$Variable(tf$random_normal(shape(n_hidden, 1L), stddev = 0.1))
b2 <- tf$Variable(tf$zeros(shape(1L, 1L)))

# Hidden layer activation, then a linear output layer for regression.
A1 <- tf$nn$relu(tf$add(tf$matmul(X, W1), b1))
A2 <- tf$add(tf$matmul(A1, W2), b2)

# Mean squared error between the predicted and the observed price.
cost <- tf$reduce_mean(tf$square(tf$subtract(A2, Y)))
```


```{r message = F, warning = F}
# Fit a linear regression of price on three scaled features and compare its
# predictions with the actual test-set prices.
# NOTE(review): rxFastLinear() and rxPredict() are not provided by any package
# attached in the setup chunk (presumably MicrosoftML / RevoScaleR) -- confirm
# the runtime attaches them.

# Actual prices, kept as one-column tables so they can be filtered by price.
housing_train_price <- housing_train_data[, "price"]
housing_test_price <- housing_test_data[, "price"]

glm_mod <- rxFastLinear(
  price ~ sqft_living + grade + condition,
  data = housing_train_data_scaled,
  type = "regression",
  verbose = 0
)

price_mod <- rxPredict(
  glm_mod,
  data = housing_test_data_scaled,
  verbose = 0,
  extraVarsToWrite = c("sqft_living", "grade", "condition")
)

# x axis: position of each test house in the price-ordered test set.
xvals <- seq_along(price_mod$Score)

price_est_plotly <- plot_ly() %>%
  add_markers(name = "Test Set Price", x = xvals, y = housing_test_price$price) %>%
  add_markers(name = "Linear Model est", x = xvals, y = price_mod$Score) %>%
  # Signed error: positive where the model over-estimates the price.
  add_lines(name = "Magnitude Difference", x = xvals, y = price_mod$Score - housing_test_price$price) %>%
  layout(title = "Attempting Linear Regression Model
                  </br> for Pricing Homes in King County")

#rxLinePlot(Score ~ sqft_living + grade + condition, type = c("p", "r"), data = price_mod)
price_est_plotly
#summary(glm_mod)
```
Given that the distribution of prices was already known to follow a non-linear distribution, the result of this linear estimate isn't very surprising.  The linear fit may have been within a few hundred thousand dollars for the long tail, but the whole seems to be corrupted in trying to fit the collection of values above the elbow.  Below around \$1 million, the housing prices are pretty evenly distributed, but as the prices increase they start jumping up by orders of magnitude.  Let's try the same approach for all values of price < \$800K.

```{r}
# Repeat the linear fit using only homes priced below the cap, i.e. the part
# of the price curve that looks roughly linear.
price_cap <- 8e5

# The feature tables and the price tables are filtered with the same
# condition, so their rows stay aligned.
housing_train_data_rev1 <- filter(housing_train_data_scaled, price < price_cap)
housing_train_price_rev1 <- filter(housing_train_price, price < price_cap)
housing_test_data_rev1 <- filter(housing_test_data_scaled, price < price_cap)
housing_test_price_rev1 <- filter(housing_test_price, price < price_cap)

# Per-column range and mean of the capped test set. vapply() iterates over the
# columns directly instead of coercing the data frame to a matrix first.
max_test_rev1 <- vapply(housing_test_data_rev1, max, numeric(1))
min_test_rev1 <- vapply(housing_test_data_rev1, min, numeric(1))
mean_test_rev1 <- vapply(housing_test_data_rev1, mean, numeric(1))

glm_mod_rev1 <- rxFastLinear(
  price ~ sqft_living + grade + condition,
  data = housing_train_data_rev1,
  type = "regression",
  verbose = 0
)

price_mod_rev1 <- rxPredict(
  glm_mod_rev1,
  data = housing_test_data_rev1,
  verbose = 0,
  extraVarsToWrite = c("sqft_living", "grade", "condition")
)

# x axis: position of each remaining test house in the price-ordered test set.
xvals_rev1 <- seq_along(price_mod_rev1$Score)

price_est_plotly_rev1 <- plot_ly() %>%
  add_markers(name = "Test Set Price", x = xvals_rev1, y = housing_test_price_rev1$price) %>%
  add_markers(name = "Linear Model est", x = xvals_rev1, y = price_mod_rev1$Score) %>%
  # Signed error: positive where the model over-estimates the price.
  add_lines(name = "Magnitude Difference", x = xvals_rev1, y = price_mod_rev1$Score - housing_test_price_rev1$price) %>%
  layout(title = "Attempting Linear Regression Model
                  </br>for Pricing Homes in King County
                  </br>Where Price of Home < $800k")

price_est_plotly_rev1
#summary(glm_mod_rev1)
```
By cutting off the less linear part of the dataset, the maximum error between the estimate and the actual price was decreased.
```{r message = F, warning = F}



```















